1 Intro

This Notebook builds on the poll_final sf dataframe that we built in the first part of this project. poll_final contains the poll results by party and the geometry for each poll for the 2015 Federal Elections of 2015.

We will use the cancensus package to download sociodemographic data and geometry from the 2016 Canadian Census and plot the relationship between education and the results of the three main parties (libéral, conservateur and npd).

This notebook uses the tidyverse for wranling and plotting data, the cancensus package to download the Canadian Census data, the lwgeom package for making polygons “valid” and the `sf package for finding their intersections.

Disclaimer: I am not a poticial science expert, I just like messing around with data when my kids wake me up at night. The results here may be common knowledge already, my assumptions may be extremely wrong and my code could just be bugged.

load("poll_final.rdata") #results and geometry
library(sf)
library(tidyverse)
library(cancensus) #newly on CRAN!
library(lwgeom) #for st_make_valid
# remember to set options(cancensus.api_key) 
# and options(cancensus.cache_path)
# in ~/.Rprofile

colors <- c("libéral" = "#D71920", 
          "conservateur" = "#1A4782", 
          "npd" = "#F37021")

2 Code

As usual, the code for this post is available on my GitHub repo and is then hosted on my Hugo blog generated by blogdown and hosted on github/netlify. # Functions

I created three functions that wrap around the cancensus package.
get_cancensus_pct takes a parent vector and a generation level (first or last) for the children vectors and return a dataframe showing the percentage each child vectors represents at the geographic level specified.
get_cancensus_labels takes a parent vector and a generation level (first or last) for the children vectors and return the labels for the parent and children vectors.
get_cancensus_data build on these two functions. It takes a vector of parent vectors names and a generation level and returns a list of three objectS:

  • a dataframe with the percentages
  • a dictionnary showing which child are contained in each parent vector
  • a list of the parent vectors.

Finally, the wrapper the wrapper function automatically add lin changes to strings, allowing titles to fit inside ggplot. I borrowed from the internet a long time ago.


# the wrapper function automatically add lin changes to strings, allowing titles
# to fit inside ggplot
wrapper <- function(x, ...) 
{
  paste(strwrap(x, ...), collapse = "\n")
}

#this function takes a parent vector and a generation level (first or last) for
# child vectors and return the percentage each child vectors represents
get_cancensus_pct <- function(parentvar= "v_CA16_5051", 
                              generation = "first",
                              dataset="CA16", 
                              filterlevel = "PR",
                              filterregion = "10",
                              datalevel= "CSD"){
  
  mytotal <- search_census_vectors("", dataset) %>% 
    filter(vector == parentvar)
  
  if (generation == "first"){
    mychildren <- list_census_vectors(dataset) %>% 
      inner_join( mytotal  %>% select(parent_vector = vector))
  } else {
    mychildren <- mytotal %>%
      child_census_vectors(leaves_only =TRUE) 
  }
  
  # We'll need the aggregated total for our calculations so let's append them together
  myvectors <- bind_rows(mytotal, mychildren) %>%
    pull(vector)
  
  myvectorschildren <- mychildren %>% pull(vector)
  
  regions_list <- list_census_regions(dataset) %>% 
    filter(level==filterlevel,   region == filterregion) %>%
    as_census_region_list
  
  temp <- get_census(dataset, 
                       level = datalevel, 
                       regions = regions_list, 
                       vectors = myvectorschildren, 
                       geo_format = "sf", 
                       labels = "short") #%>%
    
  census <- temp %>% 
    left_join( temp %>%  
                 as.data.frame() %>%  #rowSums doesnt work on sf dataframe
                 select(GeoUID, one_of(myvectorschildren)) %>%
                 mutate("total" = rowSums( .[,myvectorschildren])) %>%
                 select(-one_of(myvectorschildren)))
    
    
  
  pct <- census %>% 
    mutate_at(.vars = myvectorschildren, funs( ifelse(total>0,100 * . /total, NA ) )) %>%
    select(-total) #%>%
  #setNames( str_replace(names(.), "v_", "pct_v_"))
  
}

#this functions 
get_cancensus_labels <- function(parentvar= "v_CA16_5051", 
                                 generation = "first", 
                                 dataset="CA16"){
  print(parentvar)
  mytotal <- search_census_vectors("", dataset) %>% 
    filter(vector == parentvar)
  
  if (generation == "first"){
    mychildren <- list_census_vectors(dataset) %>% 
      inner_join( mytotal  %>% select(parent_vector = vector))
  } else {
    mychildren <- mytotal %>%
      child_census_vectors(leaves_only =TRUE) 
  }
  
  regions_list <- list_census_regions(dataset) %>% 
    filter(level=="C") %>%
    as_census_region_list
  
  # We'll need the aggregated total for our calculations so let's append them together
  myvectorstotal <- mytotal  %>%   pull(vector)
  myvectorschildren <- mychildren %>% pull(vector)
  
  total_census <- get_census(dataset, 
                             level = "C", 
                             regions = regions_list, 
                             vectors = myvectorstotal, 
                             geo_format = NA, 
                             labels = "short") 
  total_label <- label_vectors(total_census) 
  
  if (count(mychildren)>0){ 
    children_census <- get_census(dataset, 
                                  level = "C", 
                                  regions = regions_list, 
                                  vectors = myvectorschildren, 
                                  geo_format = NA, 
                                  labels = "short") 
    
    children_label <- label_vectors(children_census) 
    
    children_label$Parent_Vector <- total_label$Vector
    children_label$Parent_Detail <- total_label$Detail
    
    labels <- children_label %>% 
      select(Parent_Vector, Parent_Detail, Vector, Detail)%>%
      arrange(Vector)
    
  } else { #prevent a bug if parent vector has no child
    labels <- total_label %>% rename(Parent_Vector = Vector, Parent_Detail = Detail) %>%
      mutate(Vector = "", Detail = "") %>%
      arrange(Vector)
  }
  return(labels)
}

# this function  takes a vector of parent vectors names and a generation level
# and returns a list of three objectS: 
# - a dataframe with the percentages
# - a dictionnary showing which child are contained in each parent vector
# - a list of the parent vectors

get_cancensus_data <- function(parents= c("v_CA16_451"),
                                generation="first",
                                dataset = "CA16",
                                filterlevel= "PR",
                                filterregion = "10",
                                datalevel = "CSD"){
  parents <- parents %>% sort()
  p1 <- map(parents[1], 
            ~ get_cancensus_pct(parentvar= .x, 
                                generation= generation,
                                dataset=dataset, 
                                filterlevel = filterlevel,
                                filterregion = filterregion,
                                datalevel= datalevel) %>%
              select(GeoUID, Type, `Region Name`, `Area (sq km)`,
                     Population,Dwellings,Households,order(colnames(.))))
  if (length(parents >=2)){
    p2plus <- map(tail(parents,-1 ), 
                  ~ get_cancensus_pct(parentvar= .x, 
                                      generation = generation,
                                      dataset=dataset, 
                                      filterlevel = filterlevel,
                                      filterregion = filterregion,
                                      datalevel= datalevel) %>%
                    select(starts_with("v_")) %>%
                    select(order(colnames(.)))) 
    pourcentages <- bind_cols(p1,p2plus)
  } else pourcentages <- bind_cols(p1)
  
  
  dictionnaire <- map_df(parents,~ get_cancensus_labels(parentvar =.x, 
                                                        generation = generation,
                                                        dataset=dataset )) 
  
  variables <- dictionnaire %>% distinct(Parent_Vector, Parent_Detail)
  
  return(list(pourcentages, dictionnaire, variables))
}

3 Code snippets I will be coming back to this script for

  • My three cancensus functions.
  • example of st_make_valid() and st_intersection()
  • combining purrr and ggplot using map2(data,..)

4 Getting the census data and applying it to the poll sections

Getting the census data and shapefile is pretty straightforward when done using the cancensus package. “v_CA16_5051” is the highest level of schooling.

We download the census data at the Dissemination Area (DA) level, which is the smallest geographical level for which Statistics Canada will release data. Its population is between 400-700 persons.

parents <- c("v_CA16_5051")
census_extract <- get_cancensus_data(parents,
                                   generation="last",
                                   dataset = "CA16",
                                   filterlevel = "C",
                                   filterregion = "01",
                                   datalevel = "DA")
## [1] "v_CA16_5051"
census_data <- census_extract[[1]]
census_dict <- census_extract[[2]]
census_vars <- census_extract[[3]]

mytotal <- search_census_vectors("", "CA16") %>% 
    filter(vector == "v_CA16_5051")
mychildren <- mytotal %>% child_census_vectors(leaves_only =TRUE) 
myvectorschildren <- mychildren %>% pull(vector)

Once we have the DA data, we find the intersection between the DA and the poll sections.
Lines that should be identical in both files are not always 100% accurate, which leads to small intersection between polygons that should not intersect. To prevent this, I ignore intersections that represent less than 10% of either the DA or poll polygon.

#make geometries valid to prevent error in st_intersection
# Error in CPL_geos_op2(op, st_geometry(x), st_geometry(y)) : 
  #Evaluation error: TopologyException: Input geom 1 is invalid: Ring Self-intersection at or near point #-77.249958667496159 48.385304531338512 at -77.249958667496159 48.385304531338512.

census_data <-  census_data %>% st_make_valid()
poll_final <- poll_final %>% st_make_valid()

intersection <- st_intersection(census_data  , 
                                poll_final %>% 
                                  st_transform(st_crs(census_data))) 
save(intersection,file="intersection.rdata")
# add in areas in m2
attArea <-intersection %>% 
  mutate(area = st_area(.) %>% as.numeric())

#only keep areas if they represent 10% of either DA or poll. 
# this is because lines that should be identical in both files are not always
# 100% accurate.

attArea_filtered <- attArea %>%  as_tibble() %>%
  group_by(FED_NUM, EMRP_NAME) %>% 
  mutate(pct_poll  = area / sum(area)) %>%
  ungroup() %>%
  group_by(GeoUID) %>%
  mutate(pct_ad = area / sum(area)) %>%
  ungroup() %>%
  filter(pct_poll >= 0.10 | pct_ad >= 0.10) %>% # 10% de la poll ou 10% de DA %>%
  group_by(FED_NUM, EMRP_NAME) %>%  # get new pct after dropping small areas
  mutate(pct_poll  = area / sum(area)) %>%
  ungroup() %>%
  group_by(GeoUID) %>%
  mutate(pct_ad = area / sum(area)) %>%
  ungroup()

We then distribute the DA population to the polls. We make three important assumptions:

  • elector population is proportional to total populattion (we should use population aged 18+ at the very least)
  • DA population is uniformly distributed on the whole area
  • Participation rate is equal among all DA that touch a poll

To make analysis easier, we group the highest levels of schooling in three groups:

  • No high school diploma
  • High school, college, trade or university certificate below bachelor
  • University (bachelor degree or above)

## distribute DA population assuming that 
## 1 ) elector population is proportional to total populattion (we should check for age 18+ at the very least)
## 2) DA population is uniformly distributed
## 3) participation rate is equal among all DA that touch a poll
poll_demo <- attArea_filtered %>%
   as.data.frame %>% select(-geometry) %>%
  group_by(FED_NUM, EMRP_NAME) %>%
  summarise_at(vars(myvectorschildren),
            funs( sum(. * pct_ad * Population, na.rm = T)/ 
                    sum(pct_ad * Population, na.rm = T)))
  
poll_demo  <- poll_demo  %>% mutate(
  pct_nohs = v_CA16_5054,
  pct_hs_college_trade =  v_CA16_5057+ v_CA16_5066 + v_CA16_5069 + 
    v_CA16_5072 + v_CA16_5075,
  pct_uni = v_CA16_5081 + v_CA16_5084 + v_CA16_5087 + v_CA16_5090 + v_CA16_5093)

complete_data <- poll_final %>%
  inner_join(poll_demo) %>%
  mutate(PR = floor(FED_NUM / 1000))
save(complete_data, file = "complete_data.rdata")

5 Results

Finally, we make scatter plots showing the relationship between the level of schooling of the poll and the results of the three main parties. Each dot represents one polling station. I have broken down the poll between the 10 provinces and the 3 territories to see if the trends hold between different areas.

Province codes:

  • 10 Newfoundland and Labrador
  • 11 Prince Edward Island
  • 12 Nova Scotia
  • 13 New Brunswick
  • 24 Quebec
  • 35 Ontario
  • 46 Manitoba
  • 47 Saskatchewan
  • 48 Alberta
  • 59 British Columbia
  • 60 Yukon
  • 61 Northwest Territories
  • 62 Nunavut

Results are not uniform across provinces, but I spotted the following trends outside of the Atlantic provinces (NFL, PEI, NS and NB).


load("complete_data.rdata")
plot_data <- complete_data%>% 
  select(pollid, libéral, conservateur, npd, pct_nohs, pct_hs_college_trade, pct_uni, PR) %>%
  gather(key="parti",value="pct_vote", libéral, conservateur, npd) %>%
  gather(key="educ",value="pct_educ", pct_nohs, pct_hs_college_trade, pct_uni) %>% 
  mutate(educ = as.factor(educ) %>% 
           fct_relevel("pct_nohs", "pct_hs_college_trade", "pct_uni"),
         parti = as.factor(parti) %>%
           fct_relevel("libéral", "conservateur", "npd"),
         PR = as.factor(PR) %>%
                          fct_recode(
                            "NFL" = "10",
                            "PEI" = "11",
                            "NS" = "12",
                            "NB" = "13",
                            "QC" = "24",
                            "ON" = "35",
                            "MB" = "46",
                            "SK" = "47",
                            "AB" = "48",
                            "BC" = "59",
                            "YK" = "60",
                            "NWT" = "61",
                            "NVT" = "62"
                          ))

A high share of population with no high school will help the NPD in most provinces outside of the Atlantic.

plot_data %>% filter(educ == "pct_nohs") %>%
   ggplot()+
                     geom_point(aes(x=pct_educ, y= pct_vote, color= parti), alpha= 0.1)+
                     geom_smooth(aes(x=pct_educ, y= pct_vote), color = "black", se = TRUE)+
                     facet_grid( parti~ PR) +
                     theme_bw()+ 
  ggtitle(wrapper(paste0("Relationship between the percentage of vote received by a party at a poll section during the 2015 federal elections and the percentage of the  poll section with no high school degree "), width=70))+ 
    scale_colour_manual(values = colors)+
  xlab("Percentage of population with this education level in the poll section") +
  ylab("Percentage of votes earned by party") 

The conservative party appears to thrive in poll sections with a high proportion of “middle level” education such as high school or college degrees, trade certificates outside of the Atlantic.


plot_data %>% filter(educ == "pct_hs_college_trade") %>%
   ggplot()+
                     geom_point(aes(x=pct_educ, y= pct_vote, color= parti), alpha= 0.1)+
                     geom_smooth(aes(x=pct_educ, y= pct_vote), color = "black", se = TRUE)+
                     facet_grid( parti~ PR) +
                     theme_bw()+ 
  ggtitle(wrapper(paste0("Relationship between the percentage of vote received by a party at a poll section during the 2015 federal elections and the percentage of the  poll section with  either a high school, trades, college degree or university certificate"), width=70))+ 
    scale_colour_manual(values = colors)+
  xlab("Percentage of population with this education level in the poll section") +
  ylab("Percentage of votes earned by party") 

A high share of university degrees typically helps the Libéral, except in the Atlantic, where it will favour the NPD. It never helps the conservative party.



plot_data %>% filter(educ == "pct_uni") %>%
   ggplot()+
                     geom_point(aes(x=pct_educ, y= pct_vote, color= parti), alpha= 0.1)+
                     geom_smooth(aes(x=pct_educ, y= pct_vote), color = "black", se = TRUE)+
                     facet_grid( parti~ PR) +
                     theme_bw()+ 
  ggtitle(wrapper(paste0("Relationship between the percentage of vote received by a party at a poll section during the 2015 federal elections and the percentage of the  poll section with a bachelor degree or above"), width=70))+ 
    scale_colour_manual(values = colors)+
  xlab("Percentage of population with this education level in the poll section") +
  ylab("Percentage of votes earned by party") 

plots <- plot_data %>%
  group_by(PR) %>%
  nest() #%>%

z <- plots %>%  mutate(plot = map2(data, PR, ~
                     ggplot(data = .x)+
                     geom_point(aes(x=pct_educ, y= pct_vote, color= parti), alpha= 0.1)+
                     geom_smooth(aes(x=pct_educ, y= pct_vote), color = "black", se = TRUE)+
                     facet_grid( parti~ educ) +
                     theme_bw()+ 
  ggtitle(wrapper(paste0("Relationship between the percentage of vote received by a party at a poll section during the 2015 federal elections and the education level of the poll section according to the 2016 Census for the province of ", .y), width=70))+ 
    scale_colour_manual(values = colors)+
  xlab("Percentage of population with this education level in the poll section") +
  ylab("Percentage of votes earned by party") ))

map2(paste0(z$PR, ".png"), z$plot, ggsave)